---
title: 'Data prepare'
author: "David Hume"
date: '`r format(Sys.time(), "%d %B %Y")`'
output: 
   html_document:
    toc: true
    toc_depth: 3 
editor_options: 
  chunk_output_type: console
---

```{r setup, message=FALSE}
library(tidyverse)
library(data.table)
library(Hmisc)
library(fasttime)
library(lubridate)
library(stringr)
library(car)
library(zoo)
library(pryr)
options(width=120, scipen=10)
knitr::opts_chunk$set(echo = TRUE, cache = TRUE, warning = FALSE, 
                      message = FALSE, cache.lazy = FALSE)

```

<div style="line-height: 2em;">
<font size="4"> 

---

```{r cache=FALSE, eval=TRUE}
Time <- Sys.time()

# Switches & setting of variables

# Data set either "All8pc", " "All1pc", "Test", "Render"
analysis <-"Render"

if (analysis=="Render") {
  analysis <- readRDS("./Data/Analysis.rds")
}


# Save files "Y" or "N"
save <- "Y"

# Resold properties only
Resold_only <- "N"

# Region index 
Region_index <- "N"

# Update summary stats?
Dataset_start <- '1995-01-01' # includes this date
#Dataset_start <- '2005-04-01' # includes this date
Dataset_stop <- '2020-01-01' # excludes this date

# TUI of transactions used for plotting different reference prices:purchase, peak & neighbour. Datasets prepared in this rmd and plotted in Figures.Rmd
Rules_v <-  c("{00D34FC5-8B2D-4854-A6A7-49036F273611}", "{662017D1-AECE-41FA-A32D-0263DEBB96B1}", "{03BB0826-8B08-495D-9A78-0087566C8CD1}", "{6D59D05B-B9BE-4FC4-93B0-0919010749BF}")
# extra "{6D59D05B-B9BE-4FC4-93B0-0919010749BF}"
Rules <-  data.table(TUI = Rules_v)



cat("SWITCHES: DATA:", analysis, "  SAVE:", save)


```

# INDEX DATA

```{r cache=FALSE, eval=TRUE}
# HPI Index https://www.gov.uk/government/statistical-data-sets/uk-house-price-index-data-downloads-february-2020

HPIndex <- fread("./Data/UK-HPI-full-file-2020-03.csv")

# The March 2020 HPI omitted the 2018 indices for the combined BOURNEMOUTH CHRISTCHURCH AND POOLE district. As of 21/07/2020 an updated HPI has not been updated. Indices for the former separate districts of BOURNEMOUTH, CHRISTCHURCH AND POOLE were published in the January 2020 HPI.
HPIndexBCP <- fread("./Data/UK-HPI-full-file-2020-01.csv")
HPIndexBCP <- HPIndexBCP[RegionName %in% c("Bournemouth", "Christchurch", "Poole"),]
HPIndex <- rbind(HPIndex, HPIndexBCP)

# Index by month, district, property type
HPI <- HPIndex[, .(Date, District =toupper(RegionName), D = DetachedIndex, S = SemiDetachedIndex, T = TerracedIndex, F=  FlatIndex)]
HPI <- melt(setDT(HPI), id.vars = c("Date","District"), variable.name = "Type", value.name = "Index")
HPI <-HPI[, Date := as.Date(Date, "%d/%m/%Y")]


# Valuation by month, district, property type
HPV <- HPIndex[, .(Date, District =toupper(RegionName), D = DetachedPrice, S = SemiDetachedPrice, T = TerracedPrice, F=  FlatPrice)]
HPV <- melt(setDT(HPV), id.vars = c("Date","District"), variable.name = "Type", value.name = "Index")
HPV <-HPV[, Date := as.Date(Date, "%d/%m/%Y")] # Date in wrong format for fastPOSIXct? gives incorrect date 01/01/1995 -> 2006-06-18

```




# FUNCTIONS
```{r cache=FALSE, eval=TRUE}

#### Function to "lookup" House Price Index by month, UK region & property type
HPIindex1 <- function(dt, index, date, name) { 
  dt[, Date := as.Date(date, origin ="1970-01-01")] #  Date vector automatically converted to numeric so have to re-convert.
  setnames(index, old= "District", new="Region") 
  setkey(dt,Date,Region, Type)
  setkey(index,Date,Region,Type)
  dt <- merge(dt,index, all.x=TRUE) # just using merge does not work
  setnames(dt, old= "Index", new=deparse(substitute(name))) # Can't use get() (need quotes in function call) here or deparse(substitute()) no quotes in function call
  dt[, Date:=NULL] # remove unnecessary column
  setnames(index, old= "Region", new="District") 
  return(dt)
}


#### Function to "lookup" House Price Index by month, district & property type
HPIindex2 <- function(dt, index, date, name) { 
  dt[, Date := as.Date(date, origin ="1970-01-01")] #  Date vector automatically converted to numeric so have to re-convert.
  setkey(dt,Date,District, Type)
  setkey(index,Date,District,Type)
  dt <- merge(dt,index, all.x=TRUE) # just using merge does not work
  setnames(dt, old= "Index", new=deparse(substitute(name))) # Can't use get() (need quotes in function call) here or deparse(substitute()) no quotes in function call
  dt[, Date:=NULL] # remove unnecessary column
  return(dt)
}


#### Function to "lookup" House Price Index by month, region 
HPIindex3 <- function(DT, index, date, region, name) { 
  DT[, Date := as.Date(date, origin ="1970-01-01")] #  Date vector automatically converted to numeric so have to re-convert.
  index<- HPIndex[RegionName==region,.(Date,Index)][, Date := as.Date(Date, "%d/%m/%Y")] # Can't use HPI. Date in wrong format for fastPOSIXct? gives incorrect date 01/01/1995 -> 2006-06-18
  setkey(DT,Date)
  setkey(index,Date)
  DT <- merge(DT,index, all.x=TRUE) # just using merge does not work
  setnames(DT, old= "Index", new=deparse(substitute(name))) # Can't use get() (need quotes in function call) here or deparse(substitute()) no quotes in function call
  DT[, Date:=NULL] # remove unnecessary column
  return(DT)
}

 #### Function to index valuation and associated variables 
Indexed <- function(dt) {
 
dt <-HPIindex2(dt ,HPI,dt[,date.val],Idx_Val) # Index at valuation date

dt <-dt[,IdxPurchA_Val:=(Idx_Val-Idx_PurchA)/Idx_PurchA # Movement in index between purchase and valuation dates
            ][,IdxPriceA:=round(PriceA*(1+IdxPurchA_Val)) # Revaluation of property to valuation date
               ][,Unrealgain:=round(IdxPriceA-PriceA) # Calculates unrealised gain or loss - price paid vs index
                 ][, Quality:=PriceA - Val_PurchA # Noisy proxy for quality Genesove & Mayer
                   ][,UnrealgainYN:=Unrealgain >=0 # Unrealised gain (True), unrealised loss (False)
                     ][, DurationVal := (DoTA %--%date.val)/dyears(1) 
                       ][,SoldInPeriod := !(quarter(date.val, with_year = TRUE)!=(quarter(IndexDoTB, with_year = TRUE))|is.na(IndexDoTB))] # Sold for a profit in period? Re-order.

dt  <- dt[, `:=`(Quality100000 = Quality/100000,
                 PriceA100000 = PriceA/100000, 
                 sold.dummy =ifelse(SoldInPeriod,1,0))]

dt  <- dt[, UnrealgainAdj:= Unrealgain/100000 
                                  ]


return(dt)
}



Country.Index <- function(DT, index, start, stop, region) {
# Function to index purchase price by regio's index and determine regional unrealised loss or gain

region.name <- ifelse(region=="England and Wales", "E_and_W", region) 
 
  DT <-HPIindex3(DT,index,start,region, RegionIdx_PurchA)
  DT <-HPIindex3(DT,index,stop,region, RegionIdx_Val)
  
  
  DT <-DT[,RegionIdxPurchA_Val:=(RegionIdx_Val-RegionIdx_PurchA)/RegionIdx_PurchA # Movement in index between purchase and valuation dates
                                  ][,RegionIdxPriceA:=round(PriceA*(1+RegionIdxPurchA_Val)) # Revaluation of property to valuation date
                                    ][,RegionUnrealgain:=round(RegionIdxPriceA-PriceA) # Calculates unrealised gain or loss - price paid vs index
                                      ][,RegionUnrealgainYN:=RegionUnrealgain >=0 # Unrealised gain (True), unrealised loss (False)
                                        ][,Interaction_Region_District:=RegionUnrealgainYN*UnrealgainYN # 
                                          ][order(IndexDoTA, TUI, date.val),] # Unrealised gain (True), unrealised loss (False)
 

    setnames(DT, old= c("RegionIdx_PurchA","RegionIdx_Val","RegionIdxPurchA_Val","RegionIdxPriceA","RegionUnrealgain","RegionUnrealgainYN"), new=c(paste0(region,"_Idx_PurchA"),paste0(region.name,"_Idx_Val"),paste0(region.name,"_IdxPurchA_Val"),paste0(region.name,"_IdxPriceA"),paste0(region.name,"_Unrealgain"),paste0(region.name,"_UnrealgainYN")))    
    
    return(DT)
    
}

Region.Index <- function(DT, index) {
  
 DT <-HPIindex1(DT, index, DT[,IndexDoTA],RegionIdx_PurchA)
 DT <-HPIindex1(DT, index, DT[,date.val],RegionIdx_Val)
  
        DT[,RegionIdxPurchA_Val:=(RegionIdx_Val-RegionIdx_PurchA)/RegionIdx_PurchA # Movement in index between purchase and valuation dates
          ][,RegionIdxPriceA:=round(PriceA*(1+RegionIdxPurchA_Val)) # Revaluation of property to valuation date
            ][,RegionUnrealgain:=round(RegionIdxPriceA-PriceA) # Calculates unrealised gain or loss - price paid vs index
              ][,RegionUnrealgainYN:=RegionUnrealgain >=0][,Interaction_Region_District:=RegionUnrealgainYN*UnrealgainYN # 
                                          ][order(IndexDoTA, TUI, date.val),] # Unrealised gain (True), unrealised loss (False)
  
}


```

***TABLE A2


```{r cache=FALSE, eval=TRUE}

# DATA SET UP: ALL.CLEAN

# pp_complete: https://www.gov.uk/government/statistical-data-sets/price-paid-data-downloads

All.clean <- fread("./Data/pp-complete202003.csv", col.names = c("TUI", "PriceA", "DoTA", "Postcode", "Type", "New", "Tenure", "PAON", "SAON", "Street", "Locality", "Town_City", "Old_District", "County", "Category", "Status"))

# Include property transaction up to 31 Dec 2019 
All.clean <-All.clean[, DoTA := fastPOSIXct(DoTA, tz = NULL, required.components = 3L)][
  , DoTA := make_date(year(DoTA), month(DoTA), mday(DoTA))][
    DoTA < ymd(Dataset_stop),]

# Create unique ID for each property based on its address and type
setkey(All.clean, Postcode, PAON, SAON, Type)
All.clean <- All.clean[, Property_Id := .GRP, by = key(All.clean)][, Property_Id := as.character(Property_Id)][order(Property_Id, DoTA)]

sample_select <- data.table(Description = "Full dataset up to 31 December 2019", Transactions = as.character(nrow(All.clean)), Properties = length(unique(All.clean[, Property_Id])), Property_Quarters = " ")
# In England, at March 2017, making up the 23.9 million dwellings were 15.1 million owner occupied, 4.8 million private rented, 2.4 million rented from private registered providers (housing association) and 1.6 million rented from local authorities (Tables 2 and 3)
# The number of dwellings in Wales continues to grow steadily with an estimated 1.43 million dwellings at 31 March 2019, an increase of 5per cent over the last 10 years. 70 per cent of dwellings were owner occupied in 2019, this proportion has remained stable since 2012. The number of privately rented dwellings has risen in the last decade and accounted for 15 per cent of all dwellings at 31 March 2019.In 2019, the social sector accounted for 16 per cent of all dwellings, a,proportion that has remained stable over the past ten years. 10 per cent of all dwellings are owned by Registered Social Landlords and 6 per cent by local authorities. 


All.clean <-All.clean[Category=="A"]# exclude properties not Standard price paid entry
sample_select <- rbind(sample_select, data.table(Description = "Standard price paid entry", Transactions = nrow(All.clean), Properties = length(unique(All.clean[, Property_Id])), Property_Quarters = " "))

All.clean <-All.clean[PriceA>=250,]# exclude properties bought for less than £250
sample_select <- rbind(sample_select, data.table(Description = "Price at least £250", Transactions = nrow(All.clean), Properties = length(unique(All.clean[, Property_Id])), Property_Quarters = " "))


# Full address
All.clean <-All.clean[PAON!=""][(str_length(Postcode)>=6) & (str_length(Postcode)<=8)][order(Postcode, PAON, SAON),]# exclude properties with incomplete postcodes
Clean.District <- fread("./Data/Region_clean2003.csv", col.names = c("Old_District", "District", "Region", "Sub-region")) # Districts per PP database mapped to corrected admin districts. Admin districts change over time. HPI index is retrospectively updated but HMLR dataset isn't
# HPI dataset as at 
# set the ON clause as keys of the tables:
setkey(All.clean,Old_District)
setkey(Clean.District,Old_District)


# Maps original district names onto updated district names & regions 
All.clean  <- merge(All.clean,Clean.District[,.(Old_District, District, Region)], all.x=TRUE)
All.clean <- All.clean[order(Property_Id, DoTA)] 

All.clean <- All.clean[District !="N/A (split)",] 

sample_select <- rbind(sample_select, data.table(Description = "Complete address", Transactions = nrow(All.clean), Properties = length(unique(All.clean[, Property_Id])), Property_Quarters = " "))
```





```{r cache=FALSE, eval=TRUE}

# CREATE RESOLD & OBSERVATIONS BY QUARTER
# Transactions are ordered by address (Postcode, PAON, SAON) and date of purchase. A property is resold if the previous transaction has the same address.

Resold <- All.clean[, .(Property_Id, TUI, DoTA, PriceA, Type, New, Region, District, Postcode, PAON, SAON)][order(Property_Id, DoTA)]

# Exclude properties where only 1 transaction i.e. not resold?
 if (Resold_only=="Y") {
   
   Resold_ID <- Resold[, .N, by = Property_Id][N>1,] # Properties sold at least once.
   
   setkey(Resold_ID, Property_Id)
   setkey(Resold, Property_Id)
   Resold <- merge(Resold_ID[, .(Property_Id)],Resold, all.x=TRUE)
   
   sample_select <- rbind(sample_select, data.table(Description = "Relating to resold properties",  Transactions = nrow(Resold), Properties = length(unique(Resold[, Property_Id])), Property_Quarters = " "))

 }


# What's the Property Id for the transactions used in the rules plots
setkey(Resold, TUI)
setkey(Rules, TUI)
Rules <- Rules[Resold, nomatch=0][, .(Property_Id, TUI)]


Unique_properties <- unique(Resold[, Property_Id])


# Choose sample
# Do as early as possible because lead & lag take so long
 if (analysis=="All1pc") {
  mult <- 0.01
  } else if (analysis=="All5pc") {
  mult <- 0.05
  } else if (analysis=="All10pc") {
    mult <- 0.10
  } else if (analysis=="All15pc") {
    mult <- 0.15 
  } else if (analysis=="All1M") {
    mult <- 1000000 / length(Unique_properties)  
  } else if (analysis=="All20pc") {
    mult <- 0.20
  } else if (analysis=="Test") {
  mult <- 1000/14173433 # has to suffiently large so that factor labels works 
  } else if (analysis=="All") {
  mult <- 1 # has to suffiently large so that factor labels works 
}
  

Sample_property <- sample(Unique_properties, length(Unique_properties)*mult)
Resold_all <- Resold # Needed for "Extract the other variables for the Rules transactions"
Resold<-Resold[Property_Id %in% Sample_property, ]# Sample is based on observations not properties


# Are any of Rules transaction part of the sample. If so exclude, otherwise flag these non-sample additions?
setkey(Resold, TUI)
setkey(Rules, TUI)
Rules <- merge(Rules, Resold[,.(TUI, Type)], all.x=TRUE)
Rules <- Rules[is.na(Type)==TRUE, .(Property_Id, Sample=FALSE)]


# Extract the other variables for the Rules transactions
setkey(Rules, Property_Id)
setkey(Resold_all, Property_Id)
Rules <- Rules[Resold_all, nomatch=0]
Resold_all <- NULL

#  Flag the sample properties and combine with the Rules properties
Resold <-Resold[,Sample:=TRUE]

Resold <- rbind(Resold, Rules)


sample_select <- rbind(sample_select, data.table(Description = paste0("Sample: ", analysis),  Transactions = nrow(Resold[Sample==TRUE,]), Properties = length(unique(Resold[Sample==TRUE, Property_Id])), Property_Quarters = " "))


# HPI valuation date is first day of each month.
# Index at purchase date is taken as the index on 1st day of the month of purchase
# Index at sale date is taken as the index on 1st day of the month of sale

Resold <- Resold[order(Property_Id, DoTA)][, `:=`(PriceB = shift(PriceA, 1L, type="lead"), DoTB = shift(DoTA, 1L, type="lead")), by = .(Property_Id, Type)][, `:=`(DurationB = (DoTA %--%DoTB)/dyears(1), 
         GrossProfitB = PriceB-PriceA, 
         IndexDoTA = make_date(year(DoTA), month(DoTA), 01), 
         IndexDoTB = make_date(year(DoTB), month(DoTB), 01))]

#Resold<- Resold[, `:=`(Area = str_split((str_split(Postcode," ", simplify = TRUE)[,1]),"\\d", simplify = TRUE)[,1],
#                       Outward = str_split(Postcode," ", simplify = TRUE)[,1]),]



# Exclusion: used discretion to also exclude duration of less than a week
Resold <- Resold[DurationB>1/52|is.na(DurationB)=="TRUE",] # If NA implies property not resold so keep

sample_select <- rbind(sample_select, data.table(Description = "At least 1/52 year between successive purchases",  Transactions = nrow(Resold[Sample==TRUE,]), Properties = length(unique(Resold[Sample==TRUE, Property_Id])), Property_Quarters = " "))


# Valuation of comparable property at purchase and valuation index at purchaseo
Resold <- HPIindex2(Resold,HPV,Resold[,IndexDoTA],Val_PurchA) # Typical property price by district & type at purchase date 
 
Resold <- HPIindex2(Resold ,HPI,Resold[,IndexDoTA],Idx_PurchA) # Index at purchase date

Resold <- HPIindex2(Resold ,HPI,Dataset_stop,Idx_EndA)

Resold <- Resold[!is.na(Idx_PurchA),][,IdxPurchA_End:=(Idx_EndA-Idx_PurchA)/Idx_PurchA # Movement in index between purchase and sample end date (summary stats)
            ][,Val_EndA:=round(PriceA*(1+IdxPurchA_End)) # Revaluation of property to valuation date
               ]
#sample_select <- rbind(sample_select, data.table(Description = "Match to HPI dataset",  Transactions = nrow(Resold[Sample==TRUE,]), Properties = length(unique(Resold[Sample==TRUE, Property_Id])), Property_Quarters = " "))


# Create a table of Property IDs for every quarter from beginning to end of dataset
# date.val HPI Valuations are 1st day of month. Treat valuation 1995/04/01 as relating to 1995Q1
# Map property transactions onto this table, use roll join to c/f transactions
# If Property is sold more than once in the quarter choose final sale

quarters <- seq(ymd(Dataset_start),ymd(Dataset_stop),by='quarters')


Resold.indexed <- CJ(Property_Id=unique(Resold[, Property_Id]), date.val=quarters)

Resold.indexed <-Resold.indexed[, YrQtr :=  as.character(paste0(quarter(ymd(date.val)-1, with_year = TRUE), "Q"))]# date.val HPI Valuations are 1st day of month. Treat valuation 1995/04/01 as relating to 1995Q1

Resold <- Resold[, YrQtr := as.character(paste0(quarter(DoTA, with_year = TRUE), "Q"))][order(Property_Id, DoTA)]

setkey(Resold.indexed, Property_Id, YrQtr)
setkey(Resold, Property_Id, YrQtr)

Resold.indexed <- Resold[Resold.indexed, roll=TRUE][
  , .SD[.N], by = .(Property_Id, YrQtr)][ # If Property is sold more than once in the quarter choose the last sale
  is.na(PriceA)==FALSE, ][ # Removes quarters before property is purchased
    order(Property_Id, date.val)]



# Determine indexed values and associated variables

Resold.indexed <- Indexed(Resold.indexed)
Resold.indexed <- Resold.indexed[is.na(Idx_Val)==FALSE,]



sample_select <- rbind(sample_select, data.table(Description = "Quarterly valuation",  Transactions = length(unique(Resold.indexed[Sample==TRUE,TUI])), Properties = length(unique(Resold.indexed[Sample==TRUE, Property_Id])), Property_Quarters = nrow(Resold.indexed[Sample==TRUE,])))


if (Region_index =="Y") {
  # Needs to be before Correlation prepare because Type of property changes from single letter to description
  # Need to update sample_select
  #Resold.indexed <- Country.Index(Resold.indexed, HPIndex, Resold.indexed[,IndexDoTA], Resold.indexed[,date.val],"England and Wales")
  
  Resold.indexed <- Region.Index(Resold.indexed, HPI) 
  
  Resold.indexed <- Resold.indexed[is.na(RegionIdx_PurchA)!="TRUE",] #one property in district Wealden was NA?
  

}


Resold.indexed  <- Resold.indexed[, `:=`(New = factor(New, labels = c("Old", "New-build")),
                                         Type = factor(Type, labels =c("Detached","Flat","Semi","Terraced")))][
                                           , Type:=relevel(Type,"Flat")]


# Flag first observation in each TUI group
First <- Resold.indexed[order(TUI, date.val)][, .SD[1], by = TUI][, .(TUI, date.val, First_TUI=TRUE)]
setkey(Resold.indexed, TUI, date.val)
setkey(First, TUI, date.val)
Resold.indexed <- merge(Resold.indexed, First, all.x=TRUE)
Resold.indexed <- Resold.indexed[, First_TUI := ifelse(is.na(First_TUI)==TRUE, FALSE, First_TUI)]

# Flag first valuation in each TUI group
First2 <- Resold.indexed[First_TUI ==TRUE, .(TUI, date.val, First_val = date.val)]
setkey(Resold.indexed, TUI, date.val)
setkey(First2, TUI, date.val)
Resold.indexed <- First2[Resold.indexed, roll=TRUE]

# See Last_date_N below


```



```{r cache=FALSE, eval=FALSE}

Resold.indexed  <- Resold.indexed[, `:=`(Shift1_Val = shift(IdxPriceA, n=1L),
                                         Shift2_Val = shift(IdxPriceA, n=2L),
                                         Shift1_Date = shift(date.val, n=1L),
                                         Shift2_Date = shift(date.val, n=2L))
                                   ][, `:=`(Pre1_Val = ifelse(First_TUI==FALSE, Shift1_Val, IdxPriceA),
                                            Pre1_Date = as.Date(ifelse(First_TUI==FALSE, Shift1_Date, date.val), origin = "1970-01-01"),
                                            Pre2_Val = ifelse(is.na(Shift2_Val)==FALSE, Shift2_Val, ifelse(First_TUI==FALSE, shift(IdxPriceA, n=1L), IdxPriceA)),
                                            Pre2_Date = as.Date(ifelse(is.na(Shift2_Val)==FALSE, Shift2_Date, ifelse(First_TUI==FALSE, shift(date.val, n=1L), date.val)), origin = "1970-01-01"))
                                           ][, `:=`(Pre1_Dur = (Pre1_Date %--%date.val)/dyears(1)
                                                    , Pre2_Dur = (Pre2_Date %--%date.val)/dyears(1))]


```




***


#  Create Peak price variables & plot

```{r cache=FALSE, eval=TRUE}

# Peak_date_start - date of the peak, start of the 3Q test period
# Peak_date / date.val - End of the 3q test period, peak becomes new reference price
# New_peak_date - Peak stops being reference price iff new peak starts at the end of successful 3Q testing period
# Peak_date_end_NA iff new peak starts at the beginning of test period or no reference point during successful testing period. If not the last peak price than the date of next peak price otherwise the last date 

# Peak - 3Q testing period, peak becomes new ref point at the end of successful testing period
# Peak2 - 3Q testing period, peak becomes new ref point at the beginning of testing period
# Peak3 - 3Q testing period, peak becomes new ref point at the end of testing period, no reference point during (successful) testing period.


Resold.indexed.subset  <- Resold.indexed[,.(Property_Id, TUI, First_TUI, date.val, PriceA, IdxPriceA)
                                         ][, PriceA.n := as.numeric(PriceA) # Need to convert PriceA from integer to dbl otherwise conditions ifelse == doesn't work.
                                           ][, IdxAdj := ifelse(First_TUI==TRUE, PriceA.n, IdxPriceA)] # when determining the running maximum, need the first entry in each TUI group to be the higher of purchase price or first valuation
 

# Initial Peak_price is the purchase price, it must stay the peak price for at least 3 quarters because of the 3 quarter test period
# Did the maximum valuation for each observation x at time t first arise 3 quarters ago, if so x at t-3 is a new maximum otherwise there is no new peak (NA)
Resold.indexed.subset  <- setDT(Resold.indexed.subset)[, Index_price_max := cummax(IdxAdj), by = TUI][
  , Peak_date_start := as.Date(ifelse(First_TUI==TRUE, date.val, shift(date.val, n=3L)), origin = "1970-01-01"), by = TUI][
    , `:=`(Peak_price = ifelse(First_TUI==TRUE, PriceA.n, ifelse(Index_price_max==shift(IdxPriceA, n=3L), shift(IdxPriceA, n=3L), NA)),
           Last_date = max(date.val)), by = TUI]

#### Change rule so that peak must be a peak part 1) initial peak price is therefore not a purchase price
Resold.indexed.subset <- Resold.indexed.subset[Peak_price==PriceA.n, `:=`(Peak_price = NA, Peak_date_start=NA)]




# Extract only the peak prices
#  For plotting need start & end of test period and start and end of  perios peak is a reference price
# Need the date each peak ends, when the property is owned by the same person that's the start date of the next peak, or last valuation for that owner, or last valuation for that dataset
# Peak_prices is later in the Rmd fwritten to a .csv file and used in plotting in the Figures.Rmd

Peak_prices <- Resold.indexed.subset[is.na(Peak_price)==FALSE,.(TUI, Peak_price, Peak_date_start, date.val, Last_date, Purchase_price=PriceA)]

Last <- Peak_prices[order(TUI, date.val)][, .SD[.N], by = TUI][, .(TUI, date.val, Last =TRUE)]
setkey(Peak_prices, TUI, date.val)
setkey(Last, TUI, date.val)
Peak_prices <- merge(Peak_prices, Last, all.x=TRUE)
Peak_prices <- Peak_prices[, Last := ifelse(is.na(Last)==TRUE, FALSE, Last)]

Peak_prices <- Peak_prices[,`:=`(New_peak_date = ifelse(Last == FALSE, shift(date.val, n=1L, type = "lead"), Last_date),
                                 Peak_date_end_NA = ifelse(Last == FALSE, shift(Peak_date_start, n=1L, type = "lead"), Last_date)), by = TUI][
                                   , `:=`(New_peak_date = as.Date(New_peak_date, origin = "1970-01-01"),
                                          Peak_date_end_NA = as.Date(Peak_date_end_NA, origin = "1970-01-01"),
                                          Last_date=NULL)]

Peak_prices.subset <- Peak_prices[,.(TUI, date.val, Peak_price, Peak_date= date.val)]

# set the ON clause as keys of the tables:
setkey(Resold.indexed, "TUI", "date.val")
setkey(Peak_prices.subset, "TUI", "date.val")
Resold.indexed <- Peak_prices.subset[Resold.indexed, roll=TRUE]


Peak_prices.subset <- Peak_prices[,.(TUI, date.val=Peak_date_start, Peak_price2=Peak_price, Peak_date_start)]
Resold.indexed <- Resold.indexed[order(TUI, date.val)][, id:= 1:.N] # Tiny number of duplicates were introduced in below merge,  so 1 is removed (see part 2 below)

# set the ON clause as keys of the tables:
setkey(Resold.indexed, "TUI", "date.val")
setkey(Peak_prices.subset, "TUI", "date.val")
Resold.indexed <- Peak_prices.subset[Resold.indexed, roll=TRUE][, .SD[1], by = .(id)][, id:=NULL] # Tiny number of duplicates were introduced in below merge,  so 1 is removed (see part 1 below)


sample_select <- rbind(sample_select, data.table(Description = "Both peak and purchase price present",  Transactions = length(unique(Resold.indexed[!is.na(Peak_price),][Sample==TRUE,TUI])), Properties = length(unique(Resold.indexed[!is.na(Peak_price),][Sample==TRUE, Property_Id])), Property_Quarters = nrow(Resold.indexed[!is.na(Peak_price),][Sample==TRUE,])))


Resold.indexed <-Resold.indexed[,Peak.unrealgain:=round(IdxPriceA-Peak_price) # Calculates unrealised gain or loss - price paid vs index
                                ][,Peak.unrealgainYN:=Peak.unrealgain >=0 # Unrealised gain (True), unrealised loss (False)
                                  ]

Resold.indexed  <- Resold.indexed[, Peak.unrealgainAdj:= Peak.unrealgain/100000 
                                  ][, `:=`(Peak.unrealgain.pos = ifelse(Peak.unrealgainYN,Peak.unrealgainAdj,0),
                                           Peak.unrealgain.neg = ifelse(!Peak.unrealgainYN,Peak.unrealgainAdj,0))
                                    ][, `:=`(Peak.unrealgain.pos2 = Peak.unrealgain.pos^2,
                                             Peak.unrealgain.neg2 = Peak.unrealgain.neg^2,
                                             DurationPeak = (Peak_date %--%date.val)/dyears(1) )]

Resold.indexed  <- Resold.indexed[, Peak_price3 := ifelse(Peak_price==Peak_price2, Peak_price, NA)
                                  ][, `:=`(Peak2.unrealgainAdj=round(IdxPriceA-Peak_price2)/100000,
                                           Peak3.unrealgainAdj=round(IdxPriceA-Peak_price3)/100000,
                                           DurationPeak2 = (Peak_date_start %--%date.val)/dyears(1),
                                           DurationPeak3 = ifelse(is.na(Peak_price3)==TRUE, NA, (Peak_date %--%date.val)/dyears(1)))]

```


#  Create nosy-neighbour reference points

```{r cache=FALSE, eval=FALSE}

All.clean <-All.clean[, Type := factor(Type, labels =c("Detached","Flat","Semi","Terraced"))][, New := factor(New, labels = c("Old", "New-build"))]

When.sold.all <- All.clean[,.(Postcode, Neighbour.DoTA=DoTA, Neighbour.price = PriceA, Type, Neighbour.PAON=PAON, Neighbour.SAON=SAON)][
  ,YrQtr:=paste0(quarter(Neighbour.DoTA, with_year = TRUE), "Q")][
    ,.(Neighbour.DoTA, YrQtr, Neighbour.price, Type, Postcode,  Neighbour.PAON, Neighbour.SAON)]


# For resold properties want to determine for each quarter that a neighbouring property is sold the valuation of property A (Neighbour.valuation = IdxPriceA) This is taken from Resold.indexed 
# and the sales price of the neighbouring property (Neighbour.price = PriceA) taken from When.sold 
# However want reference price of first observation of each transaction to be purchase price of property A, so remove first row then bring it back (don't want to double up with valuation of another property also purchased that quarter)
# Date matching: YrQtr of purchase from When.sold with YrQtr of valuation
# (Don't group by New since when resold will no longer be new)
Resold.indexed <- Resold.indexed[order(TUI, date.val)][, Last_date_N := tail(date.val,1), by = TUI]
Resold.indexed.subset <- Resold.indexed[order(TUI, date.val)][First_TUI==FALSE,.(TUI, YrQtr, Neighbour.valuation=IdxPriceA, Neighbour.reference.date=date.val, Last_date_N,  Postcode, Type, PAON, SAON, DoTB, First_TUI)]


# Match property to others sold in postcode of same type. Don't match property to itself (same PAON & SAON). Purchase of neighbour must be before sale of target property.If more than one sale than match to the highest price.  
setkey(When.sold.all,Postcode, YrQtr, Type)
setkey(Resold.indexed.subset, Postcode, YrQtr, Type)


When.sold <- When.sold.all[Resold.indexed.subset, nomatch=0][
  Neighbour.DoTA<DoTB | is.na(DoTB)==TRUE,][
    Neighbour.PAON!=PAON | Neighbour.SAON!=SAON,][order(TUI, YrQtr, -Neighbour.price)][, .SD[1], by = .(Postcode, YrQtr, Type)][
      , `:=`(Neighbour.reference=ifelse(Neighbour.price>Neighbour.valuation, Neighbour.price, Neighbour.valuation))][
        , .(TUI, YrQtr, Neighbour.valuation, Neighbour.price, Neighbour.reference, Neighbour.reference.date, Last_date_N, First_TUI)]

# As above However want reference price of first observation .... Also need purchase price as reference point
Resold.indexed.subset <- Resold.indexed[First_TUI==TRUE,.(TUI, YrQtr, Neighbour.valuation=PriceA, Neighbour.reference.date=date.val, Last_date_N, Neighbour.price = PriceA, Neighbour.reference=PriceA, First_TUI)][order(TUI, Neighbour.reference.date)]
When.sold  <- rbind(Resold.indexed.subset, When.sold)

# Only integer, double or character colums may be roll joined
When.sold <- When.sold[,YrQtr:= as.character(YrQtr)]
Resold.indexed <- Resold.indexed[,YrQtr:= as.character(YrQtr)]
Resold.indexed <- Resold.indexed[order(TUI, date.val)][, id:= 1:.N] # Tiny number of duplicates were introduced in below merge, they seemed identical, so 1 is removed (see part 2 below)

setkey(When.sold,TUI, YrQtr)
setkey(Resold.indexed, TUI, YrQtr)

Resold.indexed <- When.sold[Resold.indexed[, Last_date_N:=NULL], roll=TRUE][, .SD[1], by = .(id)][, id:=NULL]
# Otherwise Last_date_N duplicated
# Tiny number of duplicates were introduced in above merge, they seemed identical, so 1 is removed (see part 1 above)


Resold.indexed <- Resold.indexed[, `:=`(Neighbour.price.unrealgainAdj=round(IdxPriceA-Neighbour.price)/100000,
                                        Neighbour.valuation.unrealgainAdj=round(IdxPriceA-Neighbour.valuation)/100000,
                                        Neighbour.reference.unrealgainAdj=round(IdxPriceA-Neighbour.reference)/100000,
                                        Duration.neighbour = (Neighbour.reference.date %--%date.val)/dyears(1))
                                 ][, Neighbour.valuation.unrealgainYN:=Neighbour.valuation.unrealgainAdj >=0]



# When.sold is used in plotting neighbour rules. Neighbour_plot function
When.sold <- When.sold[,Neighbour.date.end := as.Date(ifelse(is.na(shift(Neighbour.reference.date, n=1L, type = "lead"))== FALSE, shift(Neighbour.reference.date, n=1L, type = "lead"), Last_date_N), origin ="1970-01-01"), by = TUI]



```



#  Median split

## Number of transactions


## Number of transactions by district, quarter and type ... DO YOU WANT TO DO THIS?????

```{r cache=FALSE, eval=TRUE}

# Number of transactions by district, quarter and type

number_transactions <-All.clean[,.(DoTA, District)][
  , YrQtr:=paste0(quarter(DoTA, with_year = TRUE), "Q")][District!="#N/A",.N, by = .(YrQtr, District)][(order(YrQtr, District))]

setnames(number_transactions, "N", "No_transactions") 


setkey(Resold.indexed, YrQtr, District)
setkey(number_transactions, YrQtr, District)

# perform the join, eliminating not matched rows from Right
Resold.indexed <-merge(Resold.indexed, number_transactions, all.x=TRUE)


Resold.indexed <- Resold.indexed[, No_transactions:= as.numeric(No_transactions)][
    , No_transactions:= ifelse(is.na(No_transactions)==TRUE,0,No_transactions)][# Valuing every quarter, if no transactions in district then, replaces NA with 0
      , Transactions_band := factor(ifelse(No_transactions==median(No_transactions), "Equals median", ifelse(No_transactions>median(No_transactions), "High", "Low")))]

```




## Housing stock

```{r cache=FALSE, eval=TRUE}
# Housing stock England https://www.gov.uk/government/statistical-data-sets/live-tables-on-dwelling-stock-including-vacants
# Working file directory: ⁨Google Drive⁩ ▸ ⁨Documents⁩ ▸ ⁨Research⁩ ▸ ⁨Housing⁩ ▸ ⁨Disposition effect ⁩ ▸ ⁨Data⁩
# Working file: Stock LT125.xlsx

# Housing stock Wales (?)  https://statswales.gov.wales/Catalogue/Housing/Dwelling-Stock-Estimates/dwellingstockestimates-by-localauthority-tenure
# Same working file directory as England
# Working file: Stock Wales_200103.xlsx


# For English dwellings names of districts had to be brought up to date

dwelling_stock <- fread("./Data/Table_125_Dwelling_stock_estimates.csv", col.names = c("Id", paste0("Delete", 1:4), "District", paste0("Delete", 5:6), 2001:2018), header = FALSE)
dwelling_stock <- dwelling_stock[Delete6!="County" & is.na(Id)==FALSE,][, `:=`(Delete1=NULL, Delete2=NULL, Delete3=NULL, Delete4=NULL, Delete5=NULL, Delete6=NULL)]

dwelling_stock_Wales <- fread("./Data/Stock_Wales.csv", col.names = c("Id", "District", 2001:2019), header = FALSE)
dwelling_stock_Wales <- dwelling_stock_Wales[, `2019`:= NULL] # Currently English data set doesn't have 2019

dwelling_stock <- rbind(dwelling_stock, dwelling_stock_Wales)
dwelling_stock <- melt(dwelling_stock, id.vars = c("Id", "District"),  variable.name = "Year", value.name = "Stock")


# csv file includes ".." where no value; numbers have commas that need to be removed before character vector can be coerced into integer
# Some district names have changed, so districts cn have the same district names
dwelling_stock <- dwelling_stock[, Stock:= ifelse(Stock=="..", 0, Stock)][
  , Stock := as.integer(gsub(",", "", Stock, fixed = TRUE)) ][
    , .(`1Q`=as.numeric(sum(Stock))), by =.(District, Year)][order(District, Year)]

# data is annual ('1Q') Assume housing stock changes linearly between 1 year & next
dwelling_stock <- dwelling_stock[, `:=`(`2Q` = 0.75*`1Q`+0.25*shift(`1Q`, n=1L, type="lead"),
                                        `3Q` = 0.5*`1Q`+0.5*shift(`1Q`, n=1L, type="lead"),
                                        `4Q` = 0.25*`1Q`+0.75*shift(`1Q`, n=1L, type="lead")), by=.(District)]

dwelling_stock <- melt(dwelling_stock, id.vars = c("District", "Year"),  variable.name = "YrQtr", value.name = "Stock")

# Get NA at the end of the series of years
dwelling_stock <- dwelling_stock[, YrQtr:=paste(Year, YrQtr, sep = ".")][is.na(Stock)==FALSE,.(District, YrQtr, Stock)]



  setkey(Resold.indexed, YrQtr, District)
  setkey(dwelling_stock, YrQtr, District)
  
  # perform the join, eliminating not matched rows from Right
Resold.indexed <- merge(Resold.indexed, dwelling_stock, all.x=TRUE)
  
Resold.indexed <- Resold.indexed[, Stock_band := ifelse(Stock==median(Stock, na.rm = TRUE), "Equals median", ifelse(Stock>median(Stock, na.rm = TRUE), "High", "Low"))]


```

```{r cache=FALSE, eval=TRUE}

# LTV uses an approach taken from a study by Cloyne et al., (2019). Data is FCA Private dataset based on Mortgage Lenders & Administrators Return: Table 3.5 Median loan-to-value (LTV) ratios (%) Quarterly Geographic Breakdown (see psd-mortgages-2019.xlsx, tab Quarterly Geographic Breakdown)
# filtered for re-mortgagors only and going as far back as they have data .
# Cloyne, J., Huber, K., Ilzetzki, E. and Kleven, H., 2019. The effect of house prices on household borrowing: a new approach. American Economic Review, 109(6), pp.2104-36

LTV.w  <- fread("./Data/PSD Mortgages - Borrower Type Remortgagors LTV a790df422.csv", skip=1)
setnames(LTV.w, old= "Median LTV", new="Region_old")

LTV <-  melt(LTV.w, id.vars = "Region_old", variable.name = "YrQtr_old", value.name = "LTV_index")

LTV <- LTV[is.na(LTV_index)==FALSE,
           ][,YrQtr:=str_replace_all(YrQtr_old, c(" Q1" = ".1Q", " Q2" = ".2Q", " Q3" = ".3Q", " Q4" = ".4Q"))
             ][, Region := toupper(str_replace_all(Region_old, c("Central & Greater London" = "London", "Eastern" = "East of England")))
               ][, .(Region, YrQtr, LTV_index)][order(Region, YrQtr)]



setkey(Resold.indexed, YrQtr, Region)
setkey(LTV, YrQtr, Region)

# perform the join, eliminating not matched rows from Right
Resold.indexed <-merge(Resold.indexed, LTV, all.x=TRUE)

Resold.indexed <- Resold.indexed[, LTV_band := factor(ifelse(LTV_index==median(LTV_index, na.rm = TRUE), "Equals median", ifelse(LTV_index>median(LTV_index, na.rm = TRUE), "High", "Low")))]

```



***

```{r cache=FALSE, eval=TRUE, echo=FALSE}



Rules <-  Resold.indexed[TUI %in% Rules_v,.(IdxPriceA, PriceA, Peak_price,  Peak_price3, TUI, Peak_date_start, DoTA, date.val)]


Resold.indexed<-Resold.indexed[Sample=="TRUE",.(Property_Id, Type, New, Quality100000, Region, date.val, YrQtr, sold.dummy, IdxPriceA, PriceA, DurationVal, DurationB, Peak_date_start, Peak_price, DurationPeak, Peak_price2, DurationPeak2, Peak_price3, DurationPeak3, Transactions_band, Stock_band, LTV_band, TUI, Val_EndA, GrossProfitB)][order(Property_Id, date.val)]


sample_select <- sample_select[, `:=`(Purchases_pc = format(round(100*as.numeric(Transactions)/max(as.numeric(Transactions)),1),nsmall = 1),
                                      Properties_pc = format(round(100*as.numeric(Properties)/max(as.numeric(Properties)),1),nsmall = 1))
                               ][,.(Description, Purchases=Transactions, Purchases_pc, Properties, Properties_pc, Property_Quarters)]



if (save == "Y") {
  fwrite(Resold.indexed, paste0("./Data/Resold.indexed.", analysis, ".csv"))
  fwrite(Rules, "./Data/Rules.csv")
  #fwrite(When.sold, paste0("./Data/When.sold.", analysis, ".csv"))
  fwrite(Peak_prices, paste0("./Data/Peak_prices.", analysis, ".csv"))
  fwrite(sample_select, paste0("./Tables/sample_select", analysis, ".csv"))
  }

Sys.time() - Time

```




</div>
</font> 